## steps
# extract and transform data on debtor and instrument level
# import final debtors dataset
# inner join between the two datasets to remove redundant debtors and import size flag + nace_code

# final datasets
import numpy as np
import pandas as pd
import pyodbc
import os
from os import listdir


os.chdir(r'P:\ECB business areas\DGR\Staff\papouts\Research\Active\Firm Heterogeneities\Final datasets')
path_orbis = r'P:\ECB business areas\DGR\Staff\papouts\Research\Active\Firm Heterogeneities\Final datasets\Orbis'
# for NUTS
path_data = r'P:\ECB business areas\DGR\Staff\papouts\Research\Active\Firm Heterogeneities\Final datasets\NUTS output data'
if not os.path.isdir(path_data):
    os.makedirs(path_data)


# debtor level
sql_conn = pyodbc.connect('DSN=DISC DP Impala 64bit',autocommit=True)

query = """
SELECT DISTINCT *

FROM (

SELECT           cntry_dbtr, 
                 dbtr_id,
                 nace_code,
                 SUM(outst_amnt_shr_sq) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS HHI,
                 COUNT(crdtr_id) OVER (PARTITION BY cntry_dbtr, dbtr_id) as nmbr_crdtrs,
                 mb_shr,
                 outst_amnt_ttl as ONA_dbtr
                 
FROM (
 SELECT DISTINCT cntry_dbtr, 
                 dbtr_id, 
                 nace_code,
                 outst_amnt_ttl, 
                 outst_amnt_shr_sq,
                 crdtr_id,
                 outst_amnt_per_crdtr,
                 outst_amnt_ttl_max,
                 outst_amnt_ttl_max/outst_amnt_ttl AS mb_shr                
 
 FROM (
 SELECT          cntry_dbtr, 
                 dbtr_id, 
                 crdtr_id,
                 cntrct_id,
                 instrmnt_id,
                 nace_code,
                 MAX(CASE WHEN default_aggr_dbtr_lvl = -99 AND default_dbtr_lvl = -99 THEN -99
                      WHEN (default_aggr_dbtr_lvl = 0 AND default_dbtr_lvl = -99) OR
                           (default_aggr_dbtr_lvl = 0 AND default_dbtr_lvl = 0) OR
                           (default_aggr_dbtr_lvl = -99 AND default_dbtr_lvl = 0) THEN 0
                      ELSE 1 END) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS dflt_stts_debtor,
                 outst_amnt_per_crdtr,
                 outst_amnt_ttl,
                 (outst_amnt_per_crdtr/outst_amnt_ttl)*(outst_amnt_per_crdtr/outst_amnt_ttl) AS outst_amnt_shr_sq,
                 MAX(outst_amnt_per_crdtr) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS outst_amnt_ttl_max
                 
 FROM (
 SELECT          cntry_dbtr,
                 dbtr_id, 
                 crdtr_id,
                 cntrct_id,
                 instrmnt_id,
                 t2.nace_code,
                 otstndng_nmnl_amnt_cv_c,
                 MAX(dflt_stts_instr_flag) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS default_aggr_dbtr_lvl,
                 MAX(dflt_stts_debtor_flag) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS default_dbtr_lvl,
                 SUM(otstndng_nmnl_amnt_cv_c) OVER (PARTITION BY cntry_dbtr, dbtr_id, crdtr_id) AS outst_amnt_per_crdtr,
                 SUM(otstndng_nmnl_amnt_cv_c) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS outst_amnt_ttl

FROM            lab_org_dgr_fir.ac_instr_201912_kk t1
LEFT JOIN       lab_org_dgr_fir.ac_dbtr_201912_kk t2
ON              t1.dbtr_id=t2.entty_riad_id

WHERE           nvl(otstndng_nmnl_amnt_cv_c,0) + nvl(off_blnc_sht_amnt_cv,0) + nvl(cmmtmnt_incptn_cv,0) > 0                              
                AND prjct_fnnc_ln != 1
                
          ) tbl
 
      
                                ) tbl2
                                                
WHERE    dflt_stts_debtor = 0 

) tbl3
                                                
WHERE mb_shr IS NOT NULL   

) tbl4               

                """
                
dd_dbtr = pd.read_sql(query, sql_conn)
sql_conn.close()

# check
dd_dbtr.isna().sum()
ch = dd_dbtr.describe()

# import final debtors dataset
dbtr_f = pd.read_stata('AC_final_debtors_f.dta')

# merge datasets and take size flag
dd_merged = dd_dbtr.merge(dbtr_f[['cntry', 'entty_riad_id','sz_f']], how='left', left_on=['cntry_dbtr', 'dbtr_id'], right_on=['cntry', 'entty_riad_id'])
dd_merged.isna().sum()

# final dataset
dd_final = dd_merged[dd_merged.sz_f.isna() == False][['cntry_dbtr', 'dbtr_id', 'sz_f', 'nace_code', 'hhi', 'nmbr_crdtrs', 'mb_shr', 'ona_dbtr']].copy()

# export dataset
dd_final.to_stata('dbtr_lvl_dt_f.dta', write_index=False)

del dd_merged, dd_final, dd_dbtr

# instrument level
# interest rate, PD, maturities
# NULL values are ignored when summed

sql_conn = pyodbc.connect('DSN=DISC DP Impala 64bit',autocommit=True)

query = """

 SELECT DISTINCT cntry_dbtr, 
                 dbtr_id, 
                 nace_code,
                 typ_instr_corr,
                 outst_amnt_byinstr,
                 prtctn_amnt_byinstr,
                 ona_intrate_c_ttl,
                 ona_orgnlmtrty_c_ttl,
                 ona_pddbtr_c_ttl,
                 w_intrate_c_ttl/ona_intrate_c_ttl AS wir_byloantype,
                 w_orgnlmtrty_c_ttl/ona_orgnlmtrty_c_ttl AS mtrty_byloantype,
                 w_pddbtr_c_ttl/ona_pddbtr_c_ttl AS pddbtr_byloantype
 
 FROM (
 SELECT          cntry_dbtr, 
                 dbtr_id, 
                 cntrct_id,
                 instrmnt_id,
                 typ_instr_corr,
                 nace_code,
                 otstndng_nmnl_amnt_cv_c,
                 prtctn_allctd_vl_inst_cv_c,
                 MAX(CASE WHEN default_aggr_dbtr_lvl = -99 AND default_dbtr_lvl = -99 THEN -99
                      WHEN (default_aggr_dbtr_lvl = 0 AND default_dbtr_lvl = -99) OR
                           (default_aggr_dbtr_lvl = 0 AND default_dbtr_lvl = 0) OR
                           (default_aggr_dbtr_lvl = -99 AND default_dbtr_lvl = 0) THEN 0
                      ELSE 1 END) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS dflt_stts_debtor,
                 outst_amnt_byinstr,
                 prtctn_amnt_byinstr,
                 SUM(ona_intrate_c) OVER (PARTITION BY cntry_dbtr, dbtr_id, typ_instr_corr) AS ona_intrate_c_ttl,
                 SUM(ona_orgnlmtrty_c) OVER (PARTITION BY cntry_dbtr, dbtr_id, typ_instr_corr) AS ona_orgnlmtrty_c_ttl,
                 SUM(ona_pddbtr_c) OVER (PARTITION BY cntry_dbtr, dbtr_id, typ_instr_corr) AS ona_pddbtr_c_ttl,
                 SUM(annlsd_agrd_rt_c*ona_intrate_c) OVER (PARTITION BY cntry_dbtr, dbtr_id, typ_instr_corr) AS w_intrate_c_ttl,
                 SUM(orgnl_mtrty_c_y*ona_orgnlmtrty_c) OVER (PARTITION BY cntry_dbtr, dbtr_id, typ_instr_corr) AS w_orgnlmtrty_c_ttl,
                 SUM(pd_dbtr_c*ona_pddbtr_c) OVER (PARTITION BY cntry_dbtr, dbtr_id, typ_instr_corr) AS w_pddbtr_c_ttl
                 
                 
 FROM (
 SELECT          cntry_dbtr,
                 dbtr_id, 
                 cntrct_id,
                 instrmnt_id,
                 typ_instr_corr,
                 t2.nace_code,
                 otstndng_nmnl_amnt_cv_c,
                 prtctn_allctd_vl_inst_cv_c,
                 CASE WHEN (annlsd_agrd_rt < 0 OR annlsd_agrd_rt > 0.3) THEN NULL ELSE annlsd_agrd_rt END AS annlsd_agrd_rt_c,
                 CASE WHEN (annlsd_agrd_rt < 0 OR annlsd_agrd_rt > 0.3) THEN NULL ELSE otstndng_nmnl_amnt_cv_c END AS ona_intrate_c,
                 CASE WHEN orgnl_mtrty_c/365 > 30 THEN 30 ELSE orgnl_mtrty_c/365 END AS orgnl_mtrty_c_y,
                 CASE WHEN orgnl_mtrty_c IS NULL THEN NULL ELSE otstndng_nmnl_amnt_cv_c END AS ona_orgnlmtrty_c,
                 CASE WHEN (pd_dbtr > 0 AND pd_dbtr <= 1) THEN pd_dbtr ELSE NULL END AS pd_dbtr_c,
                 CASE WHEN (pd_dbtr > 0 AND pd_dbtr <= 1) THEN otstndng_nmnl_amnt_cv_c ELSE NULL END AS ona_pddbtr_c,                
                 MAX(dflt_stts_instr_flag) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS default_aggr_dbtr_lvl,
                 MAX(dflt_stts_debtor_flag) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS default_dbtr_lvl,
                 SUM(otstndng_nmnl_amnt_cv_c) OVER (PARTITION BY cntry_dbtr, dbtr_id, typ_instr_corr) AS outst_amnt_byinstr,
                 SUM(prtctn_allctd_vl_inst_cv_c) OVER (PARTITION BY cntry_dbtr, dbtr_id, typ_instr_corr) AS prtctn_amnt_byinstr
                 

FROM            lab_org_dgr_fir.ac_instr_201912_kk t1
LEFT JOIN       lab_org_dgr_fir.ac_dbtr_201912_kk t2
ON              t1.dbtr_id=t2.entty_riad_id

WHERE           nvl(otstndng_nmnl_amnt_cv_c,0) + nvl(off_blnc_sht_amnt_cv,0) + nvl(cmmtmnt_incptn_cv,0) > 0                              
                AND prjct_fnnc_ln != 1
      
          ) tbl
 
      
                                ) tbl2
                                                
WHERE    dflt_stts_debtor = 0 

                                                
                """
                
dd_instr = pd.read_sql(query, sql_conn)
sql_conn.close()

# check
dd_instr.isna().sum()
ch = dd_instr.describe()

# merge datasets and take size flag and weight
dd_merged = dd_instr.merge(dbtr_f[['cntry', 'entty_riad_id','sz_f', 'blnc_sht_ttl_m_final']], how='left', left_on=['cntry_dbtr', 'dbtr_id'], right_on=['cntry', 'entty_riad_id'])
dd_merged.isna().sum()
dd_merged[dd_merged.sz_f.isna() == True]['dbtr_id'].nunique()
# why are the missing debtors more? 
# -> the debtors with missing mb_shr were excluded already and this is why the number for debtor dataset is lower

# final dataset
dd_final = dd_merged[dd_merged.sz_f.isna() == False].drop(columns=['cntry', 'entty_riad_id'])
dd_final.isna().sum()

# export dataset
dd_final.to_stata('instr_lvl_dt.dta', write_index=False)

del dd_merged, dd_final, dd_instr


#### instrument level from main bank
sql_conn = pyodbc.connect('DSN=DISC DP Impala 64bit',autocommit=True)

query = """

 SELECT DISTINCT cntry_dbtr, 
                 dbtr_id,
                 crdtr_id,
                 nace_code,
                 typ_instr_corr,
                 outst_amnt_byinstr,
                 outst_amnt_per_crdtr,
                 outst_amnt_crdtr_loan,
                 outst_amnt_ttl, 
                 outst_amnt_ttl_max,
                 outst_amnt_ttl_max/outst_amnt_ttl AS mb_shr,
                 CASE WHEN (outst_amnt_per_crdtr - outst_amnt_ttl_max) = 0 THEN 1 ELSE 0 END AS mb_flag,
                 CASE WHEN (outst_amnt_per_crdtr - outst_amnt_ttl_max) = 0 THEN outst_amnt_crdtr_loan ELSE NULL END AS mb_ONA_loan
                 
 
 FROM (
 SELECT          cntry_dbtr, 
                 dbtr_id, 
                 crdtr_id,
                 cntrct_id,
                 instrmnt_id,
                 typ_instr_corr,
                 nace_code,
                 otstndng_nmnl_amnt_cv_c,
                 MAX(CASE WHEN default_aggr_dbtr_lvl = -99 AND default_dbtr_lvl = -99 THEN -99
                      WHEN (default_aggr_dbtr_lvl = 0 AND default_dbtr_lvl = -99) OR
                           (default_aggr_dbtr_lvl = 0 AND default_dbtr_lvl = 0) OR
                           (default_aggr_dbtr_lvl = -99 AND default_dbtr_lvl = 0) THEN 0
                      ELSE 1 END) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS dflt_stts_debtor,
                 outst_amnt_byinstr,
                 outst_amnt_per_crdtr,
                 outst_amnt_crdtr_loan,
                 outst_amnt_ttl,
                 MAX(outst_amnt_per_crdtr) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS outst_amnt_ttl_max

                 
                 
 FROM (
 SELECT          cntry_dbtr,
                 dbtr_id, 
                 crdtr_id,
                 cntrct_id,
                 instrmnt_id,
                 typ_instr_corr,
                 t2.nace_code,
                 otstndng_nmnl_amnt_cv_c,
                 MAX(dflt_stts_instr_flag) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS default_aggr_dbtr_lvl,
                 MAX(dflt_stts_debtor_flag) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS default_dbtr_lvl,
                 SUM(otstndng_nmnl_amnt_cv_c) OVER (PARTITION BY cntry_dbtr, dbtr_id, typ_instr_corr) AS outst_amnt_byinstr,
                 SUM(otstndng_nmnl_amnt_cv_c) OVER (PARTITION BY cntry_dbtr, dbtr_id,  crdtr_id) AS outst_amnt_per_crdtr,
                 SUM(otstndng_nmnl_amnt_cv_c) OVER (PARTITION BY cntry_dbtr, dbtr_id,  crdtr_id, typ_instr_corr) AS outst_amnt_crdtr_loan,               
                 SUM(otstndng_nmnl_amnt_cv_c) OVER (PARTITION BY cntry_dbtr, dbtr_id ) AS outst_amnt_ttl
                 

FROM            lab_org_dgr_fir.ac_instr_201912_kk t1
LEFT JOIN       lab_org_dgr_fir.ac_dbtr_201912_kk t2
ON              t1.dbtr_id=t2.entty_riad_id

WHERE           nvl(otstndng_nmnl_amnt_cv_c,0) + nvl(off_blnc_sht_amnt_cv,0) + nvl(cmmtmnt_incptn_cv,0) > 0                              
                AND prjct_fnnc_ln != 1
      
          ) tbl
 
      
                                ) tbl2
                                                
WHERE    dflt_stts_debtor = 0 

                                                
                """
                
dd_instr = pd.read_sql(query, sql_conn)
sql_conn.close()

# check
dd_instr.isna().sum()
ch = dd_instr.describe()

# merge datasets and take size flag and weight
dd_merged = dd_instr.merge(dbtr_f[['cntry', 'entty_riad_id','sz_f', 'blnc_sht_ttl_m_final']], how='left', left_on=['cntry_dbtr', 'dbtr_id'], right_on=['cntry', 'entty_riad_id'])
dd_merged.isna().sum()
dd_merged[dd_merged.sz_f.isna() == True]['dbtr_id'].nunique()

# final dataset
dd_final = dd_merged[dd_merged.sz_f.isna() == False].drop(columns=['cntry', 'entty_riad_id'])
dd_final.isna().sum()

# export dataset
# this does not include the crdtr_id and loan by each creditor
#dd_final.to_stata('instr_lvl_mb_dt.dta', write_index=False)
# WE NEED THE AMOUNTS BY CREDITORS AND TYPE OF INSTRUMENT
dd_final.to_stata('instr_lvl_mb_ext_dt.dta', write_index=False)

del dd_merged, dd_final, dd_instr


# creditor level
# we need the total ONA of a bank in a region (debtor's region) -> outside the query

sql_conn = pyodbc.connect('DSN=DISC DP Impala 64bit',autocommit=True)

query = """


 SELECT DISTINCT cntry_dbtr, 
                 dbtr_id,
                 nace_code,
                 outst_amnt_ttl,
                 crdtr_id,
                 outst_amnt_per_crdtr,
                 nuts1_dbtr,
                 nuts2_dbtr,
                 nuts3_dbtr,
                 pstl_cd                
 
 FROM (
 SELECT          cntry_dbtr, 
                 dbtr_id, 
                 crdtr_id,
                 cntrct_id,
                 instrmnt_id,
                 nace_code,
                 nuts1_dbtr,
                 nuts2_dbtr,
                 nuts3_dbtr,
                 pstl_cd,
                 MAX(CASE WHEN default_aggr_dbtr_lvl = -99 AND default_dbtr_lvl = -99 THEN -99
                      WHEN (default_aggr_dbtr_lvl = 0 AND default_dbtr_lvl = -99) OR
                           (default_aggr_dbtr_lvl = 0 AND default_dbtr_lvl = 0) OR
                           (default_aggr_dbtr_lvl = -99 AND default_dbtr_lvl = 0) THEN 0
                      ELSE 1 END) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS dflt_stts_debtor,
                 outst_amnt_per_crdtr,
                 outst_amnt_ttl
                 
 FROM (
 SELECT          cntry_dbtr,
                 dbtr_id, 
                 crdtr_id,
                 cntrct_id,
                 instrmnt_id,
                 t2.nace_code,
                 t2.pstl_cd,
                 SUBSTR(t2.trrtrl_unt, 1, 3) AS nuts1_dbtr,
                 SUBSTR(t2.trrtrl_unt, 1, 4) AS nuts2_dbtr,
                 t2.trrtrl_unt AS nuts3_dbtr,
                 otstndng_nmnl_amnt_cv_c,
                 MAX(dflt_stts_instr_flag) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS default_aggr_dbtr_lvl,
                 MAX(dflt_stts_debtor_flag) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS default_dbtr_lvl,
                 SUM(otstndng_nmnl_amnt_cv_c) OVER (PARTITION BY cntry_dbtr, dbtr_id, crdtr_id) AS outst_amnt_per_crdtr,
                 SUM(otstndng_nmnl_amnt_cv_c) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS outst_amnt_ttl

FROM            lab_org_dgr_fir.ac_instr_201912_kk t1
LEFT JOIN       lab_org_dgr_fir.ac_dbtr_201912_kk t2
ON              t1.dbtr_id=t2.entty_riad_id

WHERE           nvl(otstndng_nmnl_amnt_cv_c,0) + nvl(off_blnc_sht_amnt_cv,0) + nvl(cmmtmnt_incptn_cv,0) > 0                              
                AND prjct_fnnc_ln != 1
      
          ) tbl
 
      
                                ) tbl2
                                                
WHERE    dflt_stts_debtor = 0 

             

                """
                
dd_crdtr = pd.read_sql(query, sql_conn)
sql_conn.close()

# check
dd_crdtr.isna().sum()
dd_crdtr.nunique()
# import final debtors
AC_final = pd.read_stata(r'dbtr_lvl_dt.dta')
dd_merged = AC_final[['cntry_dbtr', 'dbtr_id', 'sz_f', 'nace_code', 'ona_dbtr']].merge(dd_crdtr, how='left', on=['cntry_dbtr', 'dbtr_id', 'nace_code'])
dd_merged.nuts1_dbtr.str[0:2].unique()
# EL is GR - change
dd_merged['nuts1_dbtr'] = dd_merged['nuts1_dbtr'].str.replace("EL", "GR")
dd_merged['nuts2_dbtr'] = dd_merged['nuts2_dbtr'].str.replace("EL", "GR")
dd_merged['nuts3_dbtr'] = dd_merged['nuts3_dbtr'].str.replace("EL", "GR")

# import EU Commission files
codes_all = []
path_codes = r'P:\ECB business areas\DGR\Databases and Programme files\DGR Data Management Centre\Data\AnaCredit\AdHoc\Niklas\Firm Hetereogeneities\Final datasets\NUTS and ZIP'
for f in listdir(path_codes):
    if f[7:9] in ['AT','BE','CY','DE','EE','ES','FI','FR','EL','IE','IT','LT','LU','LV','MT','NL','PT','SI','SK']:
        print (f)
        dt = pd.read_csv(path_codes + '\\' + f, sep=';')
        codes_all.append(dt)

# concatenate       
codes_eu = pd.concat(codes_all)    
# clean
codes_eu['NUTS3'] = codes_eu['NUTS3'].str.replace("'", "")
codes_eu['CODE'] = codes_eu['CODE'].str.replace("'", "")
# change EL to GR 
codes_eu['NUTS3'] = codes_eu['NUTS3'].str.replace("EL", "GR")
# remove spaces and dashes from the table for the merge
codes_eu['CODE'] = codes_eu['CODE'].str.replace(" ", "")
codes_eu['CODE'] = codes_eu['CODE'].str.replace("-", "")


# TWO GROUPS OF ISSUES
# 1 - MISSING NUTS CODE IN THE MAIN DATASET
# 2 - OUTDATED NUTS CODES IN RIAD

## MISSING NUTS CODE IN THE MAIN DATASET
# check merged dataset
ch = dd_merged.describe()
ch1 = dd_merged[dd_merged.crdtr_id == -99].copy()
# FR, DE, ES, IE, IT, PT
ch1.groupby(['cntry_dbtr'])['ona_crdtr_nuts1'].agg([np.sum, len])
# GROUP 1 - missing nuts code (-99 + NAs)
ch2 = dd_merged[(dd_merged.nuts1_dbtr == '-99') | (dd_merged.nuts1_dbtr.isna() == True)].copy()
# including (dd_merged.nuts1_dbtr == '') does not make a difference, there are no such observations
ch2.groupby(['cntry_dbtr'])['ona_crdtr_nuts1'].agg([np.sum, len])
ch2.groupby(['cntry_dbtr'])['dbtr_id'].nunique() # distinct firms
 # remove spaces and dashes from the table for the merge
ch2['pstl_cd'] = ch2['pstl_cd'].str.replace(" ", "")
ch2['pstl_cd'] = ch2['pstl_cd'].str.replace("-", "")
# merge
ch2 = ch2.merge(codes_eu, how='left', left_on=['pstl_cd'], right_on=['CODE'])
ch2.isna().sum()
# match the country because otherwise we have duplicates
ch2_cl = ch2[ch2['cntry_dbtr'] == ch2['NUTS3'].str[0:2]].copy()
# this removes also missings
ch2_cl.isna().sum()
ch2_cl.pstl_cd.describe()
cc = ch2_cl.pstl_cd.drop_duplicates()

# merge with the main dataset
dd_merged = dd_merged.merge(ch2_cl[['cntry_dbtr','dbtr_id', 'NUTS3']].drop_duplicates(), how='left', on=['cntry_dbtr','dbtr_id'])

# combine information from the two columns (NUTS3 and nuts3_dbtr) into a new one
dd_merged.isna().sum()
dd_merged['nuts3_dbtr'].describe()
# we have both NAs and -99 entries
dd_merged['nuts3_dbtr_final'] = dd_merged['nuts3_dbtr']
dd_merged.loc[dd_merged.nuts3_dbtr == '-99', 'nuts3_dbtr_final'] = dd_merged['NUTS3']
dd_merged.loc[dd_merged.nuts3_dbtr.isna() == True, 'nuts3_dbtr_final'] = dd_merged['NUTS3']
# check
ch3 = dd_merged[dd_merged.nuts3_dbtr_final == '-99'].copy() # None
ch3 = dd_merged[dd_merged.nuts3_dbtr_final.isna() == True].copy() # part of these do not have a nace code
ch3_1 = dd_merged[(dd_merged.nuts3_dbtr_final.isna() == True) & (dd_merged.nace_code == '')].copy()
ch3_2 = dd_merged[dd_merged.nace_code == ''].copy()
# 55 829 -> these should be removed from the analysis
# what is the case of the other missings?
ch4 = dd_merged[(dd_merged.nuts3_dbtr_final.isna() == True) & (dd_merged.nace_code != '')].copy()
# for some cases the postal code is missing - around half of the cases
ch4_1 = ch4[ch4.pstl_cd.isna() == True]
# these cannot be matched and should be removed?
ch4_1.groupby('cntry_dbtr')['dbtr_id'].nunique()
ch4_1.groupby('cntry_dbtr')['outst_amnt_per_crdtr'].agg([np.sum, len])
# and those that have a postal code?
ch4_2 = ch4[ch4.pstl_cd.isna() == False]
ch4_2.groupby('cntry_dbtr')['dbtr_id'].nunique()
ch4_2.groupby('cntry_dbtr')['outst_amnt_per_crdtr'].agg([np.sum, len])

### further manual checks by country

# LU - USE
ch2[ch2.dbtr_id == 18150075]
dd_merged[dd_merged.dbtr_id == 18150075]
lu_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'LU']
# it seems that there are some differences in postal codes between RIAD and the external dataset
# also, the NUTS3 classification for LU is the same regardless of the postal code
# replace this manually in the data
lu_dt = ch4_2[ch4_2.cntry_dbtr == 'LU'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
lu_dt['NUTS3'] = lu_pstl.NUTS3.unique()[0]

# ES !!!!!! - USE
ch2[ch2.dbtr_id == 14470241]
es_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'ES']
# match by only the first two digits from the postal code?
es_dt = ch4_2[ch4_2.cntry_dbtr == 'ES'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
es_dt['trim'] = es_dt.pstl_cd.str[0:2]
es_dt.groupby(['trim'])['dbtr_id'].count()
# 10 000 debtors will have to be removed because they cannot be matched
es_pstl['trim'] = es_pstl.CODE.str[0:2]
es_dt = es_dt.merge(es_pstl[['NUTS3', 'trim']].drop_duplicates(), how='left', on=['trim'])
# duplications occur - exclude these ES cases?
es_dupl = es_dt.groupby(['dbtr_id'])['cntry_dbtr'].count().reset_index()
es_dupl.rename(columns={'cntry_dbtr':'dupl'}, inplace=True)
es_dt = es_dt.merge(es_dupl, how='left', on=['dbtr_id'])

# FR !!!!!! - USE
ch2[ch2.dbtr_id == 15396815]
fr_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'FR']
fr_pstl['trim'] = fr_pstl.CODE.str[0:2]
tt = fr_pstl[['NUTS3', 'trim']].drop_duplicates()
# match by first two and then manually clean duplicates?
fr_dt = ch4_2[ch4_2.cntry_dbtr == 'FR'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
fr_dt['trim'] = fr_dt.pstl_cd.str[0:2]
fr_dt.groupby(['trim'])['dbtr_id'].count()
fr_dt = fr_dt.merge(fr_pstl[['NUTS3', 'trim']].drop_duplicates(), how='left', on=['trim'])
# remove duplications?
fr_dupl = fr_dt.groupby(['dbtr_id'])['cntry_dbtr'].count().reset_index()
fr_dupl.rename(columns={'cntry_dbtr':'dupl'}, inplace=True)
fr_dt = fr_dt.merge(fr_dupl, how='left', on=['dbtr_id'])


# CY - USE
cy_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'CY']
# only 1 code available -> directly substitute
cy_dt = ch4_2[ch4_2.cntry_dbtr == 'CY'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
cy_dt['NUTS3'] = cy_pstl.NUTS3.unique()[0]


# IE - USE
ie_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'IE']
ie_dt = ch4_2[ch4_2.cntry_dbtr == 'IE'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
# codes in the dataset should be trimmed - the first 3 characters and then matched
ie_dt['trim'] = ie_dt.pstl_cd.str[0:3]
ie_dt = ie_dt.merge(ie_pstl, how='left', left_on=['trim'], right_on=['CODE'])
# several not matched should be removed (by nan values)

# IT - USE
it_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'IT']
it_dt = ch4_2[ch4_2.cntry_dbtr == 'IT'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
# match by first two characters
it_pstl['trim'] = it_pstl.CODE.str[0:2]
it_dt['trim'] = it_dt.pstl_cd.str[0:2]
it_dt = it_dt.merge(it_pstl[['NUTS3', 'trim']].drop_duplicates(), how='left', on=['trim'])
# 1 duplicate - drop the observation
# where trim=61


# LT - USE
lt_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'LT']
lt_dt = ch4_2[ch4_2.cntry_dbtr == 'LT'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
# LT should be removed from the postal code and then mapped
lt_dt['trim'] = lt_dt.pstl_cd.str[3:]
lt_dt = lt_dt.merge(lt_pstl, how='left', left_on=['trim'], right_on=['CODE'])
# the majority were not mapped
# nevertheless, another logic is not possible, therefore discard nan values

# DE - DISCARD
de_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'DE']
de_dt = ch4_2[ch4_2.cntry_dbtr == 'DE'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
# there is no apparent way to combine the codes, they are missing and no logic emerges
# therefore, exclude these?

# PT - DISCARD
pt_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'PT']
pt_dt = ch4_2[ch4_2.cntry_dbtr == 'PT'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
pt_pstl['trim'] = pt_pstl.CODE.str[0:2]
pt_pstl[['NUTS3', 'trim']].drop_duplicates()
# postal codes are missing and matching needs manual work
# discard these?

# SK - DISCARD
sk_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'SK']
sk_dt = ch4_2[ch4_2.cntry_dbtr == 'SK'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
sk_pstl['trim'] = sk_pstl.CODE.str[0:2]
sk_pstl[['NUTS3', 'trim']].drop_duplicates()
# postal codes are missing and matching needs manual work
# discard?

# MT - USE
mt_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'MT']
mt_dt = ch4_2[ch4_2.cntry_dbtr == 'MT'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
# postal codes include numeric parts that need to be removed and afterwards matched
mt_dt['trim'] = mt_dt.pstl_cd.str[0:3]
mt_dt = mt_dt.merge(mt_pstl, how='left', left_on=['trim'], right_on=['CODE'])

# NL - DISCARD
nl_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'NL']
nl_dt = ch4_2[ch4_2.cntry_dbtr == 'NL'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
nl_pstl['trim'] = nl_pstl.CODE.str[0:2]
nl_pstl[['NUTS3', 'trim']].drop_duplicates()
# codes are missing, discard them? (no stable logic for manual replacement, but possibly check manually)

# GR - DISCARD
gr_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'EL']
gr_dt = ch4_2[ch4_2.cntry_dbtr == 'GR'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
gr_pstl['trim'] = gr_pstl.CODE.str[0:2]
gr_pstl[['NUTS3', 'trim']].drop_duplicates()
# discard? the codes are missing and no logic present for manual replacement

# FI
fi_pstl = codes_eu[codes_eu.NUTS3.str[0:2] == 'FI']
fi_dt = ch4_2[ch4_2.cntry_dbtr == 'FI'][['cntry_dbtr', 'dbtr_id', 'pstl_cd']].drop_duplicates().copy()
# code is missing, but possibly substitute it?
# the code appears in more recent dates of reference
fi_dt['NUTS3'] = 'FI1C1'

# concatenate files
cntr_concat = pd.concat([lu_dt[['cntry_dbtr', 'dbtr_id', 'NUTS3']], 
                         es_dt[(es_dt.dupl == 1) & (es_dt.NUTS3.isna() == False)][['cntry_dbtr', 'dbtr_id', 'NUTS3']], 
                         fr_dt[(fr_dt.dupl == 1) & (fr_dt.NUTS3.isna() == False)][['cntry_dbtr', 'dbtr_id', 'NUTS3']], 
                         cy_dt[['cntry_dbtr', 'dbtr_id', 'NUTS3']], 
                         ie_dt[ie_dt.NUTS3.isna() == False][['cntry_dbtr', 'dbtr_id', 'NUTS3']], 
                         it_dt[it_dt.trim != '61'][['cntry_dbtr', 'dbtr_id', 'NUTS3']], 
                         lt_dt[lt_dt.NUTS3.isna() == False][['cntry_dbtr', 'dbtr_id', 'NUTS3']], 
                         mt_dt[['cntry_dbtr', 'dbtr_id', 'NUTS3']], 
                         fi_dt[['cntry_dbtr', 'dbtr_id', 'NUTS3']]], ignore_index=True)

# count matched firms
cntr_concat.groupby(['cntry_dbtr'])['dbtr_id'].count()  
# around 30% matched
  
## merge with main file
dd_merged = dd_merged.merge(cntr_concat, how='left', on=['cntry_dbtr', 'dbtr_id'])
dd_merged.rename(columns={'NUTS3_x': 'NUTS3', 'NUTS3_y': 'NUTS3_matched'}, inplace=True)

# insert NUTS3 codes in the final column
dd_merged.loc[dd_merged.nuts3_dbtr_final.isna() == True, 'nuts3_dbtr_final'] = dd_merged['NUTS3_matched']
dd_merged.nuts3_dbtr_final.isna().sum()

## OUTDATED NUTS CODES IN RIAD
ch_all = dd_merged[(dd_merged.nuts3_dbtr_final != '-99') & (dd_merged.nuts3_dbtr_final.isna() != True)].copy()
ch_all.drop(columns=['ona_dbtr','outst_amnt_ttl','crdtr_id','outst_amnt_per_crdtr','NUTS3','NUTS3_matched'], inplace=True)
ch_all['pstl_cd'] = ch_all['pstl_cd'].str.replace(" ", "")
ch_all['pstl_cd'] = ch_all['pstl_cd'].str.replace("-", "")
# remove missing nace codes
ch_all.drop(ch_all[ch_all.nace_code == ''].index, inplace=True) # no such
ch_all.isna().sum()
# first check if the country matches the nuts3 code
ch_all_c = ch_all[ch_all.cntry_dbtr != ch_all.nuts3_dbtr_final.str[0:2]]
# only 1 case, the FR one
# check if the RIAD NUTS3 codes equal the external ones
ch_all_m = ch_all.merge(codes_eu, how='left', left_on=['pstl_cd'], right_on=['CODE'])
ch_all_m.isna().sum()
# this generates duplicates (observations almost double)
ch_all_cl = ch_all_m[ch_all_m.cntry_dbtr == ch_all_m.NUTS3.str[0:2]]
# around 100k obs are removed
# check if the RIAD code and the external match
ch_all_c2 = ch_all_cl[ch_all_cl.nuts3_dbtr_final.str[0:2] != ch_all_cl.NUTS3.str[0:2]]
# again, only FR case appears
# check matches on NUTS1 level
ch_all_c3 = ch_all_cl[ch_all_cl.nuts3_dbtr_final.str[0:3] != ch_all_cl.NUTS3.str[0:3]]
# how many cases are these on debtor level?
ch_all_c3.dbtr_id.nunique() # 13944
# check by country
ch_all_c3.groupby(['cntry_dbtr'])['nuts3_dbtr_final'].nunique()
# construct a matrix to check where would the observations go
dt_m = ch_all_c3[['cntry_dbtr','dbtr_id','nuts3_dbtr_final','NUTS3']].drop_duplicates()
# do the analysis on NUTS1 level
dt_m['NUTS1'] = dt_m.NUTS3.str[0:3]
dt_m['nuts1_dbtr'] = dt_m.nuts3_dbtr_final.str[0:3]
gr_m = dt_m.groupby(['nuts1_dbtr','NUTS1'])['dbtr_id'].count().reset_index()
bycntry = []
for i in gr_m.NUTS1.str[0:2].unique():
    bycntry.append(gr_m[gr_m.NUTS1.str[0:2] == i].pivot_table(columns=['NUTS1'], index=['nuts1_dbtr'], values=['dbtr_id']))
    
bycntry_keys = []
for i in gr_m.NUTS1.str[0:2].unique():
    bycntry_keys.append(i)
    
# create dictionary   
bycntry_dict = dict(zip(bycntry_keys, bycntry))
# export    
writer=pd.ExcelWriter(r"summary_nuts3.xlsx")
for key in bycntry_keys:
    bycntry_dict[key].to_excel(writer,sheet_name="{}".format(key))

writer.save()

# count by country
dt_m.groupby(['cntry_dbtr'])['dbtr_id'].count()


cch = codes_eu[codes_eu.NUTS3.str[0:2] == 'FR']
# these are the updated codes

# merge ch_all_c3 with the main dataset and replace the final codes
dd_merged = dd_merged.merge(ch_all_c3[['cntry_dbtr', 'dbtr_id', 'sz_f', 'nace_code', 'NUTS3']].drop_duplicates(), how='left', on=['cntry_dbtr', 'dbtr_id', 'sz_f', 'nace_code'])

# update the nuts3 final column
dd_merged.isna().sum()
dd_merged.loc[dd_merged.NUTS3_y.isna() == False, 'nuts3_dbtr_final'] = dd_merged['NUTS3_y']
# we can check these with the nuts3_dbtr column
# check FR
dd_merged[dd_merged.cntry_dbtr == 'FR']['nuts3_dbtr_final'].str[0:3].unique()
# there are still old codes 
ch_fr = dd_merged[dd_merged.nuts3_dbtr_final.str[0:3].isin(['FR2','FR3','FR4','FR5','FR6','FR7','FR8', 'FRA', 'FRZ'])].copy()
# these should be removed from the analysis because it is too expensive to match them using other means
ch_fr.dbtr_id.nunique()
# 227 firms

# check all codes in the final table
for i in dd_merged.cntry_dbtr.unique():
    print('dd_merged:')
    print(dd_merged[dd_merged.cntry_dbtr == i]['nuts3_dbtr_final'].str[0:3].unique())
    print('codes_eu:')
    print(codes_eu[codes_eu.NUTS3.str[0:2] == i]['NUTS3'].str[0:3].unique())

# ESZ??
# DEZ??
# ATZ??
# LUZ??
# GRZ??
# IEZ?? -> IE0
# CYZ?? -> CY0    
# SKZ?? -> SK0
# MTZ?? -> MT0
# cannot use other codes, because we cannot infer the exact coresponding code
    
# put to missing
# it seems that the ZZZ is used to flag missing information
ch_zzz = dd_merged[dd_merged.nuts3_dbtr_final.isna() == False]  
ch_zzz = ch_zzz[ch_zzz.nuts3_dbtr_final.str.contains('ZZZ')] 
# check by country    
ch_zzz.groupby(['cntry_dbtr'])['dbtr_id'].nunique()
#AT     436
#CY    2477
#DE      16
#ES       3
#GR      79
#IE       6
#LU       5
#MT       3
#SK       2

# apply rules
# FR
dd_merged.loc[dd_merged.nuts3_dbtr_final.str[0:3].isin(['FR2','FR3','FR4','FR5','FR6','FR7','FR8', 'FRA', 'FRZ']) == True, 'nuts3_dbtr_final'] = dd_merged['NUTS3_y']
# all other
dd_merged.loc[dd_merged.nuts3_dbtr_final.str.contains('ZZZ') == True, 'nuts3_dbtr_final'] = dd_merged['NUTS3_y']


# get also a final column of NUTS2 and NUTS1
dd_merged['nuts2_dbtr_final'] = dd_merged['nuts3_dbtr_final'].str[0:4]
dd_merged['nuts1_dbtr_final'] = dd_merged['nuts3_dbtr_final'].str[0:3]

# retain only final information and exclude missing nace code
dd_final = dd_merged[(dd_merged.nace_code != '') & (dd_merged.nuts3_dbtr_final.isna() == False)][['cntry_dbtr', 'dbtr_id', 'sz_f', 'nace_code', 'ona_dbtr',
                                            'outst_amnt_ttl', 'crdtr_id', 'outst_amnt_per_crdtr',
                                            'nuts3_dbtr_final', 'nuts2_dbtr_final', 'nuts1_dbtr_final']].copy()
dd_final.isna().sum()
# the null ONA per creditor should not alter the analysis

# save the dataset
dd_final.to_stata(path_data + r'\dt_dbtr_crdtr_nuts3.dta', write_index=False)

# sum ona by creditor and nuts code
ona_crdtr_ttl_nuts3 = dd_final.groupby(['cntry_dbtr','crdtr_id','nuts3_dbtr_final'])['outst_amnt_per_crdtr'].sum().reset_index()
ona_crdtr_ttl_nuts3.rename(columns={'outst_amnt_per_crdtr':'ona_crdtr_ttl_nuts3'}, inplace=True)
ona_crdtr_ttl_nuts2 = dd_final.groupby(['cntry_dbtr','crdtr_id','nuts2_dbtr_final'])['outst_amnt_per_crdtr'].sum().reset_index()
ona_crdtr_ttl_nuts2.rename(columns={'outst_amnt_per_crdtr':'ona_crdtr_ttl_nuts2'}, inplace=True)
ona_crdtr_ttl_nuts1 = dd_final.groupby(['cntry_dbtr','crdtr_id','nuts1_dbtr_final'])['outst_amnt_per_crdtr'].sum().reset_index()
ona_crdtr_ttl_nuts1.rename(columns={'outst_amnt_per_crdtr':'ona_crdtr_ttl_nuts1'}, inplace=True)
# merge with main dataset
dd_final = dd_final.merge(ona_crdtr_ttl_nuts3, how='left', on=['cntry_dbtr','crdtr_id','nuts3_dbtr_final'])
dd_final = dd_final.merge(ona_crdtr_ttl_nuts2, how='left', on=['cntry_dbtr','crdtr_id','nuts2_dbtr_final'])
dd_final = dd_final.merge(ona_crdtr_ttl_nuts1, how='left', on=['cntry_dbtr','crdtr_id','nuts1_dbtr_final'])

## calculate market shares
# sum creditors' ONAs in a region and then calculate the market share
ona_crdtr = dd_final[['cntry_dbtr', 'crdtr_id', 'nuts3_dbtr_final', 'nuts2_dbtr_final', 'nuts1_dbtr_final', 'ona_crdtr_ttl_nuts3', 'ona_crdtr_ttl_nuts2', 'ona_crdtr_ttl_nuts1']].drop_duplicates().copy()
# calculate total ona per country and region
ttl_ona_nuts3 = ona_crdtr.groupby(['cntry_dbtr', 'nuts3_dbtr_final'])['ona_crdtr_ttl_nuts3'].sum().reset_index()
ttl_ona_nuts2 = ona_crdtr[['cntry_dbtr', 'nuts2_dbtr_final','ona_crdtr_ttl_nuts2']].drop_duplicates().groupby(['cntry_dbtr', 'nuts2_dbtr_final'])['ona_crdtr_ttl_nuts2'].sum().reset_index()
ttl_ona_nuts1 = ona_crdtr[['cntry_dbtr', 'nuts1_dbtr_final','ona_crdtr_ttl_nuts1']].drop_duplicates().groupby(['cntry_dbtr', 'nuts1_dbtr_final'])['ona_crdtr_ttl_nuts1'].sum().reset_index()
ttl_ona_nuts3.rename(columns={'ona_crdtr_ttl_nuts3':'ttl_ona_nuts3'}, inplace=True)
ttl_ona_nuts2.rename(columns={'ona_crdtr_ttl_nuts2':'ttl_ona_nuts2'}, inplace=True)
ttl_ona_nuts1.rename(columns={'ona_crdtr_ttl_nuts1':'ttl_ona_nuts1'}, inplace=True)
ona_crdtr = ona_crdtr.merge(ttl_ona_nuts3, how='left', on=['cntry_dbtr', 'nuts3_dbtr_final'])
ona_crdtr = ona_crdtr.merge(ttl_ona_nuts2, how='left', on=['cntry_dbtr', 'nuts2_dbtr_final'])
ona_crdtr = ona_crdtr.merge(ttl_ona_nuts1, how='left', on=['cntry_dbtr', 'nuts1_dbtr_final'])
# market shares
ona_crdtr['mrkt_shr_nuts3'] = ona_crdtr['ona_crdtr_ttl_nuts3']/ona_crdtr['ttl_ona_nuts3']
ona_crdtr['mrkt_shr_nuts2'] = ona_crdtr['ona_crdtr_ttl_nuts2']/ona_crdtr['ttl_ona_nuts2']
ona_crdtr['mrkt_shr_nuts1'] = ona_crdtr['ona_crdtr_ttl_nuts1']/ona_crdtr['ttl_ona_nuts1']
ona_crdtr['mrkt_shr_nuts3'].describe()
ona_crdtr['mrkt_shr_nuts2'].describe()
ona_crdtr['mrkt_shr_nuts1'].describe()
# market shares have values of 0?
ona_crdtr.groupby(['cntry_dbtr'])['crdtr_id'].count()
xx = ona_crdtr[ona_crdtr.cntry_dbtr == 'MT']
# calculate HHI on bank level
# square market shares
ona_crdtr['mrkt_shr_nuts3_sq'] = ona_crdtr['mrkt_shr_nuts3']*ona_crdtr['mrkt_shr_nuts3']
ona_crdtr['mrkt_shr_nuts2_sq'] = ona_crdtr['mrkt_shr_nuts2']*ona_crdtr['mrkt_shr_nuts2']
ona_crdtr['mrkt_shr_nuts1_sq'] = ona_crdtr['mrkt_shr_nuts1']*ona_crdtr['mrkt_shr_nuts1']
# sum by region
hhi_nuts3 = ona_crdtr.groupby(['cntry_dbtr', 'nuts3_dbtr_final'])['mrkt_shr_nuts3_sq'].sum().reset_index()
hhi_nuts2 = ona_crdtr[['cntry_dbtr', 'crdtr_id', 'nuts2_dbtr_final', 'mrkt_shr_nuts2_sq']].drop_duplicates().groupby(['cntry_dbtr', 'nuts2_dbtr_final'])['mrkt_shr_nuts2_sq'].sum().reset_index()
hhi_nuts1 = ona_crdtr[['cntry_dbtr', 'crdtr_id', 'nuts1_dbtr_final', 'mrkt_shr_nuts1_sq']].drop_duplicates().groupby(['cntry_dbtr', 'nuts1_dbtr_final'])['mrkt_shr_nuts1_sq'].sum().reset_index()
hhi_nuts3.rename(columns={'mrkt_shr_nuts3_sq':'hhi_nuts3'}, inplace=True)
hhi_nuts2.rename(columns={'mrkt_shr_nuts2_sq':'hhi_nuts2'}, inplace=True)
hhi_nuts1.rename(columns={'mrkt_shr_nuts1_sq':'hhi_nuts1'}, inplace=True)
# take the average HHI on country level
avg_hhi_nuts3 = hhi_nuts3.groupby(['cntry_dbtr'])['hhi_nuts3'].mean().reset_index()
avg_hhi_nuts2 = hhi_nuts2.groupby(['cntry_dbtr'])['hhi_nuts2'].mean().reset_index()
avg_hhi_nuts1 = hhi_nuts1.groupby(['cntry_dbtr'])['hhi_nuts1'].mean().reset_index()
avg_hhi_nuts3.rename(columns={'hhi_nuts3':'hhi_nuts3_avg'}, inplace=True)
avg_hhi_nuts2.rename(columns={'hhi_nuts2':'hhi_nuts2_avg'}, inplace=True)
avg_hhi_nuts1.rename(columns={'hhi_nuts1':'hhi_nuts1_avg'}, inplace=True)

# export data for plotting
hhi_nuts3.to_stata(path_data + r'\hhi_nuts3.dta', write_index=False)
hhi_nuts2.to_stata(path_data + r'\hhi_nuts2.dta', write_index=False)
hhi_nuts1.to_stata(path_data + r'\hhi_nuts1.dta', write_index=False)

# associate the debtors, regions and HHI values
dd_dbtr_lvl = dd_final[['cntry_dbtr', 'dbtr_id', 'sz_f', 'nace_code', 'ona_dbtr', 'nuts3_dbtr_final', 'nuts2_dbtr_final', 'nuts1_dbtr_final']].drop_duplicates().copy()
dd_dbtr_lvl = dd_dbtr_lvl.merge(hhi_nuts3, how='left', on=['cntry_dbtr', 'nuts3_dbtr_final'])
dd_dbtr_lvl = dd_dbtr_lvl.merge(hhi_nuts2, how='left', on=['cntry_dbtr', 'nuts2_dbtr_final'])
dd_dbtr_lvl = dd_dbtr_lvl.merge(hhi_nuts1, how='left', on=['cntry_dbtr', 'nuts1_dbtr_final'])
dd_dbtr_lvl.isna().sum()

# import debtor level dataset
AC_final = pd.read_stata('dbtr_lvl_dt.dta')
AC_final.drop(AC_final[AC_final.nace_code == ''].index, inplace=True)

# merge
AC_final = AC_final.merge(dd_dbtr_lvl, how='left', on=['cntry_dbtr', 'dbtr_id'])
AC_final.isna().sum()
AC_final.drop(columns=['sz_f_y', 'nace_code_y', 'ona_dbtr_y'], inplace=True)
AC_final.rename(columns={'sz_f_x':'sz_f', 'nace_code_x':'nace_code', 'ona_dbtr_x':'ona_dbtr'}, inplace=True)

# export data for regression analysis
AC_final.to_stata(path_data + r'\dbtr_lvl_dt_nuts.dta', write_index=False)





